Each preference measure is standardized at the individual level, so that, by construction, each preference has a mean of zero and a standard deviation of one in the individual-level world sample.
Next steps: - what visualization do we want to do (why)? - always age and gender in the plots - Intercept age, intercept for age, intercept gender, intercept n
#Load packages
library(data.table)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
data.table 1.14.8 using 1 threads (see ?getDTthreads). Latest news: r-datatable.com
**********
This installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.
This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux.
**********
library(tidyr)
library(haven)
library(ggplot2)
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:data.table’:
between, first, last
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(gridExtra)
Attaching package: ‘gridExtra’
The following object is masked from ‘package:dplyr’:
combine
library(lmtest)
Loading required package: zoo
Attaching package: ‘zoo’
The following objects are masked from ‘package:base’:
as.Date, as.Date.numeric
#Add data GPS
# Einlesen des Datensets
gps_data <- haven::read_dta("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/GPS_dataset_individual_level/individual_new.dta")
# Anzeige der ersten Zeilen des Datensets
head(country_data)
# Determine the number of different countries
number_of_countries <- length(unique(gps_data$country))
# Display the number of different countries
number_of_countries
[1] 76
# Clean the data by removing records with missing values
cleaned_data <- gps_data %>%
drop_na(country, isocode, risktaking, gender, age)
# Calculate the number of records removed per variable
records_removed_per_variable <- colSums(is.na(gps_data)) - colSums(is.na(cleaned_data))
# Display the cleaned data
cleaned_data
# Display the number of records removed per variable
records_removed_per_variable
country isocode ison region language date
0 0 0 0 0 0
id_gallup wgt patience risktaking posrecip negrecip
0 0 190 634 32 247
altruism trust subj_math_skills gender age
74 163 132 0 276
# List all variables
variable_list <- names(cleaned_data) # OR colnames(your_data)
# Display the list of variables
print(variable_list)
[1] "country" "isocode" "ison" "region" "language"
[6] "date" "id_gallup" "wgt" "patience" "risktaking"
[11] "posrecip" "negrecip" "altruism" "trust" "subj_math_skills"
[16] "gender" "age"
#select only the variables of interest
cleaned_data <- cleaned_data %>%
select(country, isocode, ison, date, risktaking, gender, age)
cleaned_data
# Liste der zu löschenden Variablen
to_be_deleted_variables <- c(region, language, id_gallup, wgt, patience, posrecip, negrecip, altruism, trust, subj_math_skills)
Error: object 'region' not found
# Age Range
age_min <- min(cleaned_data$age, na.rm = TRUE)
age_max <- max(cleaned_data$age, na.rm = TRUE)
# Average Age
average_age <- mean(cleaned_data$age, na.rm = TRUE)
# Median Age
median_age <- median(cleaned_data$age, na.rm = TRUE)
Error in `median()`:
! `median.haven_labelled()` not implemented.
Backtrace:
1. stats::median(cleaned_data$age, na.rm = TRUE)
2. vctrs:::median.vctrs_vctr(cleaned_data$age, na.rm = TRUE)
# Calculate the counts of females (gender = 1) and males (gender = 2)
gender_counts <- table(cleaned_data$gender)
# Display the counts
cat("Number of Females: ", gender_counts[1], "\n")
Number of Females: 36024
cat("Number of Males: ", gender_counts[2], "\n")
Number of Males: 43415
# Create a table that breaks down the number of participants by country and gender
gender_by_country <- xtabs(~ country + gender, data = cleaned_data)
# Rename columns and rows for better readability
colnames(gender_by_country) <- c("Female", "Male")
rownames(gender_by_country) <- unique(cleaned_data$country)
# Calculate the total participants by summing the rows
total_participants <- rowSums(gender_by_country)
# Create a data frame with country, total participants, female, and male
result_table <- data.frame(
country = rownames(gender_by_country),
total_participants = total_participants,
female = gender_by_country[, "Female"],
male = gender_by_country[, "Male"]
)
# Display the result table
result_table
# Create a table with the following information: country, isocode, n (count of participants), female percentage (%), mean age, age range, and risktaking
####### add intercept and slope (age, gender, parental status, occupational status, education)
# Intercept und Slope für Alter
intercept_age <- coef(model)["(Intercept)"]
slope_age <- coef(model)["age"]
# Intercept und Slope für Geschlecht
intercept_gender <- coef(model)["(Intercept)"]
slope_gender <- coef(model)["gender"]
summary(model)
Call:
lm(formula = risktaking ~ age, data = cleaned_data)
Residuals:
Min 1Q Median 3Q Max
-2.24386 -0.68385 -0.04386 0.65311 3.05477
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.5759701 0.0089312 64.49 <2e-16 ***
age -0.0137902 0.0001974 -69.84 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.9707 on 79437 degrees of freedom
Multiple R-squared: 0.05786, Adjusted R-squared: 0.05784
F-statistic: 4878 on 1 and 79437 DF, p-value: < 2.2e-16
add intercept and slope (age, gender, parental status, occupational status, education)
# Group the data by country
table_data <- gps_data %>%
group_by(country, isocode) %>%
summarize(
n = n(),
female_percentage = mean(gender == 1) * 100,
mean_age = mean(age, na.rm = TRUE),
age_range = paste(min(age, na.rm = TRUE), "-", max(age, na.rm = TRUE)),
mean_risktaking = mean(risktaking, na.rm = TRUE)
)
`summarise()` has grouped output by 'country'. You can override using the `.groups` argument.
# Intercept and slope for age
intercept_age <- 0.5 # Replace with your actual intercept
slope_age <- 0.2 # Replace with your actual slope
# Intercept and slope for gender
intercept_gender <- -0.3 # Replace with your actual intercept
slope_gender <- 0.1 # Replace with your actual slope
# Intercept and slope for parental status
intercept_parental <- 0.1 # Replace with your actual intercept
slope_parental <- 0.05 # Replace with your actual slope
# Intercept and slope for occupational status
intercept_occupation <- 0.2 # Replace with your actual intercept
slope_occupation <- 0.03 # Replace with your actual slope
# Intercept and slope for education
intercept_education <- 0.4 # Replace with your actual intercept
slope_education <- 0.08 # Replace with your actual slope
# Add the intercept and slope information to the table
table_data <- table_data %>%
mutate(
intercept_age = intercept_age,
slope_age = slope_age,
intercept_gender = intercept_gender,
slope_gender = slope_gender,
intercept_parental = intercept_parental,
slope_parental = slope_parental,
intercept_occupation = intercept_occupation,
slope_occupation = slope_occupation,
intercept_education = intercept_education,
slope_education = slope_education
)
# Display the updated table
table_data
NA
cleaned_data$agecat <- cut(
cleaned_data$age,
breaks = c(15, 20, 30, 40, 50, 60, 70, 80, Inf), # The category boundaries
labels = c("15-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80+"), # The category labels
right = FALSE # Left end (inclusive), right end (exclusive)
)
# Display the new column
head(cleaned_data)
# number of participants in each age category
agecat_counts <- table(cleaned_data$agecat)
# Display the number of participants in the age categories
print(agecat_counts)
15-19 20-29 30-39 40-49 50-59 60-69 70-79 80+
6888 16872 15905 13583 11374 8570 4688 1559
# Calculate the correlation matrix with NA removal
correlation_matrix <- cor(cleaned_data[, c("patience", "risktaking", "altruism", "trust")], use = "pairwise.complete.obs")
# Print the correlation matrix
print(correlation_matrix)
patience risktaking altruism trust
patience 1.00000000 0.20573586 0.08397351 0.06404146
risktaking 0.20573586 1.00000000 0.09524941 0.03517175
altruism 0.08397351 0.09524941 1.00000000 0.16734183
trust 0.06404146 0.03517175 0.16734183 1.00000000
# Scatterplot to check for linearity
ggplot(cleaned_data, aes(x = age, y = risktaking)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Scatterplot for Linearity Check")
# Residual vs. Fitted plot to check for homoskedasticity
model <- lm(risktaking ~ age, data = cleaned_data)
ggplot() +
geom_point(aes(x = fitted(model), y = residuals(model))) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(title = "Residuals vs. Fitted Values for Homoskedasticity Check")
# Q-Q plot to check for normality of residuals
qqnorm(residuals(model))
qqline(residuals(model))
# Correlation matrix to check for multicollinearity
cor(cleaned_data[c("age", "risktaking")])
age risktaking
age 1.0000000 -0.2405333
risktaking -0.2405333 1.0000000
# Durbin-Watson test for autocorrelation
dwtest(model)
Durbin-Watson test
data: model
DW = 1.5415, p-value < 2.2e-16
alternative hypothesis: true autocorrelation is greater than 0
# Residual vs. Leverage plot to check for outliers
ggplot() +
geom_point(aes(x = hatvalues(model), y = residuals(model))) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(title = "Residuals vs. Leverage for Outlier Check")
#Preparing for linear regression
# Check for missing values in 'Country' and 'Risktaking' columns
missing_country <- anyNA(cleaned_data$country)
missing_risktaking <- anyNA(cleaned_data$risktaking)
# Print the results
cat("Missing values in 'Country': ", missing_country, "\n")
Missing values in 'Country': FALSE
cat("Missing values in 'Risktaking': ", missing_risktaking, "\n")
Missing values in 'Risktaking': FALSE
# Clean the data by removing records with missing values
cleaned_data <- gps_data %>%
drop_na(country, risktaking, age)
# Split the data by country and perform linear regression for each country
regression_results <- cleaned_data %>%
group_by(country) %>%
do(model = lm(risktaking ~ age, data = .)) %>%
summarize(
country = first(country),
intercept = summary(model)$coefficients[1],
slope = summary(model)$coefficients[2],
r_squared = summary(model)$r.squared
)
# Display regression results for each country
print(regression_results)
ggplot(data = regression_results, aes(x = country, y = intercept)) +
geom_bar(stat = "identity", fill = "blue") +
labs(title = "Intercepts for Risktaking by Country", x = "Country", y = "Intercept Value") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
high_intercept_countries <- subset(regression_results, intercept > 0.75)
# View the countries with intercept values over 0.75
print(high_intercept_countries)
# Create scatterplots with regression lines for countries with intercept > 0.75 and smaller points
plots <- lapply(1:nrow(regression_results), function(i) {
country <- regression_results$country[i]
intercept <- regression_results$intercept[i]
if (intercept > 0.75) {
p <- ggplot(subset(cleaned_data, country == country), aes(x = age, y = risktaking)) +
geom_point(size = 0.5) + # Set the size to a smaller value (e.g., 2)
geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "blue") +
labs(title = paste("Linear Regression for", country),
subtitle = paste("Intercept:", round(intercept, 2)))
print(p)
}
})
# Save the plots in a directory
dir.create("individual_country_plots", showWarnings = FALSE)
setwd("individual_country_plots")
Warning: The working directory was changed to /Users/laurabazzigher/Documents/GitHub/risk_wvs/code/individual_country_plots inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
for (i in seq_along(plots)) {
filename <- paste0("plot_", i, ".png")
ggsave(filename, plot = plots[[i]], width = 8, height = 6)
}
# Switch back to the original working directory
setwd("..")
print("Individual plots for countries with intercept > 0.75 and smaller points are saved in the 'individual_country_plots' directory.")
[1] "Individual plots for countries with intercept > 0.75 and smaller points are saved in the 'individual_country_plots' directory."
regression_results <- data.frame(
country = c("Algeria", "Argentina", "Austria"),
intercept = c(0.92053422, 0.51698822, 0.42606684),
slope = c(-0.0146641801, -0.0115569623, -0.0108763042),
r_squared = c(5.232529e-02, 5.638271e-02, 3.539810e-02 )
)
# Create scatterplots with regression lines for each country
plots <- lapply(seq_len(nrow(regression_results)), function(i) {
country <- regression_results$country[i]
intercept <- regression_results$intercept[i]
slope <- regression_results$slope[i]
r_squared <- regression_results$r_squared[i]
p <- ggplot(subset(cleaned_data, country == country), aes(x = age, y = risktaking)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "blue") +
labs(title = paste("Linear Regression for", country),
subtitle = paste("Intercept:", round(intercept, 2),
"Slope:", round(slope, 2),
"R-squared:", round(r_squared, 2)))
print(p)
})
# Save the plots in a directory
dir.create("individual_country_plots", showWarnings = FALSE)
setwd("individual_country_plots")
Warning: The working directory was changed to /Users/laurabazzigher/Documents/GitHub/risk_wvs/code/individual_country_plots inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
for (i in seq_along(plots)) {
filename <- paste0("plot_", i, ".png")
ggsave(filename, plot = plots[[i]], width = 8, height = 6)
}
# Switch back to the original working directory
setwd("..")
print("Individual plots are saved in the 'individual_country_plots' directory.")
[1] "Individual plots are saved in the 'individual_country_plots' directory."
# Calculate the overall regression line
overall_lm <- lm(risktaking ~ age, data = cleaned_data) # Regression over all countries
ggplot(cleaned_data, aes(x = age, y = risktaking, color = country)) +
geom_point(size = 0.2) + # Adjust point size
geom_smooth(method = "lm", se = FALSE, size = 0.5, linetype = "solid") + # Adjust line size and type
labs(title = "Scatterplot with Regression Lines for risktaking and age by Country",
x = "Age", y = "Risktaking") +
theme(legend.position = "bottom", # Move the legend below the graph
legend.key.size = unit(0.1, "in")) # Adjust the size of the legend key
# Calculate the overall regression line
overall_lm <- lm(risktaking ~ age, data = cleaned_data)
# Create a scatterplot with separate regression lines for each country
# and an overall regression line
ggplot(cleaned_data, aes(x = age, y = risktaking, color = country)) +
geom_point(size = 0.5) + # Adjust point size
geom_smooth(method = "lm", se = FALSE, size = 0.5) + # Solid line for individual countries
geom_abline(intercept = coef(overall_lm)[1], slope = coef(overall_lm)[2],
color = "red", size = 1) + # Add the overall regression line in red
labs(title = "Scatterplot with Regression Lines for risktaking and age by Country",
x = "Age", y = "Risktaking") +
theme(legend.position = "bottom", # Move the legend below the graph
legend.key.size = unit(0.1, "in")) # Adjust the size of the legend key